# Document-level corpus ----
# library() rather than require(): if quanteda is not installed the script
# should stop here, not later with "could not find function".
library(quanteda)

# load() returns the names of the objects it restored. Verify that
# `ungd_files` really came from this file, so a stale object of the same name
# left over in the session is never used by accident.
loaded_objs <- load("data/UNGD.RData")
if (!"ungd_files" %in% loaded_objs) {
  stop("data/UNGD.RData does not contain `ungd_files`", call. = FALSE)
}

# Normalise column names to lower case and keep the row names as an explicit
# `docname` column before the data frame is converted.
# NOTE(review): presumably the lower-casing makes the text column match the
# default `text_field` of corpus() -- confirm against the data.
colnames(ungd_files) <- char_tolower(colnames(ungd_files))
ungd_files$docname <- rownames(ungd_files)

# Build the corpus and cache it on disk.
corp <- corpus(ungd_files)
saveRDS(corp, "data/data_corpus.RDS")

# Sentence-level corpus ----
# Read the human coding table, then split every document into sentences.
dat_human <- read.csv("data/HumanCoding.csv", stringsAsFactors = FALSE)
corp_sent <- corpus_reshape(corp, to = "sentences")

# Attach the human-coded topic to each sentence by matching the sentence
# docnames against `doc_id`; sentences without a match receive NA.
docvars(corp_sent, "topic_human") <-
  dat_human$coding[match(docnames(corp_sent), dat_human$doc_id)]

# Replace `docname` with a factor of the form country_session_year.
docvars(corp_sent, "docname") <- with(
  docvars(corp_sent),
  factor(paste(country, session, year, sep = "_"))
)

# Cache the sentence-level corpus on disk.
saveRDS(corp_sent, file = "data/data_corpus_sent.RDS")

# Sentence-level tokens ----
# Tokenise the sentence corpus, dropping punctuation and numbers, and cache
# the result on disk.
toks_sent <- tokens(
  corp_sent,
  remove_punct = TRUE,
  remove_numbers = TRUE
)
saveRDS(toks_sent, file = "data/data_tokens_sent.RDS")
